import pandas as pd

# Load the dataset from a CSV file expected in the current working directory.
df = pd.read_csv("AB_NYC_2019.csv")

# Preview the first rows of the table (head() shows five by default).
df.head()

# Store identifier and coordinate columns as strings. As non-numeric columns
# they are left out of describe() and corr(numeric_only=True) further down.
for column in ("id", "host_id", "latitude", "longitude"):
    df[column] = df[column].astype(str)

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

# Number of distinct values in each column.
df.nunique()

id                                48895
name                              47896
host_id                           37457
host_name                         11452
neighbourhood_group                   5
neighbourhood                       221
latitude                          19048
longitude                         14718
room_type                             3
price                               674
minimum_nights                      109
number_of_reviews                   394
last_review                        1764
reviews_per_month                   937
calculated_host_listings_count       47
availability_365                    366
dtype: int64

# List the column labels of the DataFrame.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

# Number of rows for each room type, most frequent first.
df["room_type"].value_counts()

room_type
Entire home/apt    25414
Private room       22332
Shared room         1160
Name: count, dtype: int64

# Share of rows for each room type, as a fraction of the total (sums to 1).
df["room_type"].value_counts(normalize = True)

room_type
Entire home/apt    0.519650
Private room       0.456631
Shared room        0.023719
Name: proportion, dtype: float64

# Share of rows per neighbourhood group, expressed in percent and printed
# with three decimals and a trailing "%" sign.
percentage_counts = (
    df["neighbourhood_group"]
    .value_counts(normalize=True)
    .mul(100)
)
print(percentage_counts.map("{:.3f}%".format))

neighbourhood_group
Manhattan        44.307%
Brooklyn         41.114%
Queens           11.585%
Bronx             2.231%
Staten Island     0.763%
Name: proportion, dtype: object

# Rows per neighbourhood as a two-column table, most frequent first.
# The index and the count column are named explicitly so the result is the
# same on every pandas version: before pandas 2.0, value_counts() did not
# produce a column called "count", so a rename keyed on "count" silently
# did nothing.
(
    df["neighbourhood"]
    .value_counts()
    .rename_axis("neighbourhood")
    .reset_index(name="No. of Hotels")
)

# Split the price range into 5 equal-width intervals and count the rows in each.
df["price"].value_counts(bins=5)

(-10.001, 2000.0]    48820
(2000.0, 4000.0]        54
(4000.0, 6000.0]        16
(6000.0, 8000.0]         9
(8000.0, 10000.0]        7
Name: count, dtype: int64

# Hand-picked price intervals, each closed on the right. The leading -10 edge
# gives rows with a price of 0 an interval of their own, (-10, 0].
bins = [-10, 0, 50, 100, 200, 500, 800, 2000, 4000, 10000]
df["price"].value_counts(bins=bins)

(50.0, 100.0]        17373
(100.0, 200.0]       16588
(200.0, 500.0]        7340
(0.0, 50.0]           6550
(500.0, 800.0]         624
(800.0, 2000.0]        334
(2000.0, 4000.0]        54
(4000.0, 10000.0]       32
(-10.001, 0.0]          11
Name: count, dtype: int64

# Arithmetic mean of the price column.
df["price"].mean()

152.71132376395533

# Median of the price column (less sensitive to extreme values than the mean).
df["price"].median()

106.0

# Sample standard deviation of the price column (pandas uses ddof=1 by default).
df["price"].std()

240.1287131622509

# Arithmetic mean of the minimum_nights column.
df["minimum_nights"].mean()

7.031611663190611

# Median of the minimum_nights column.
df["minimum_nights"].median()

3.0

# Skewness of the price column; a positive value means a longer right tail.
df["price"].skew()

19.120831694826197

# Excess kurtosis of the price column (Fisher's definition: a normal
# distribution scores 0). It describes how heavy the tails are, i.e. how
# prone the data is to extreme values; it is not the height of the peak.
df["price"].kurt()

585.7930484394186

# Count the rows whose availability_365 equals 365. The int() call keeps the
# result a plain Python integer, as shape[0] would give.
int(df["availability_365"].eq(365).sum())

1295

# Pairwise correlation (Pearson by default) between the columns of the DataFrame.
# Missing values are excluded automatically. Non-numeric columns are skipped
# because numeric_only=True is passed; since pandas 2.0 the default is False,
# and the string columns would otherwise raise an error.
df.corr(numeric_only=True) 

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365
0	2539	Clean & quiet apt home by the park	2787	John	Brooklyn	Kensington	40.64749	-73.97237	Private room	149	1	9	19-10-2018	0.21	6	365
1	2595	Skylit Midtown Castle	2845	Jennifer	Manhattan	Midtown	40.75362	-73.98377	Entire home/apt	225	1	45	21-05-2019	0.38	2	355
2	3647	THE VILLAGE OF HARLEM....NEW YORK !	4632	Elisabeth	Manhattan	Harlem	40.80902	-73.94190	Private room	150	3	0	NaN	NaN	1	365
3	3831	Cozy Entire Floor of Brownstone	4869	LisaRoxanne	Brooklyn	Clinton Hill	40.68514	-73.95976	Entire home/apt	89	1	270	05-07-2019	4.64	1	194
4	5022	Entire Apt: Spacious Studio/Loft by central park	7192	Laura	Manhattan	East Harlem	40.79851	-73.94399	Entire home/apt	80	10	9	19-11-2018	0.10	1	0

	price	minimum_nights	number_of_reviews	reviews_per_month	calculated_host_listings_count	availability_365
count	48906.000000	48906.000000	48906.000000	38854.000000	48906.000000	48906.000000
mean	152.711324	7.031612	23.300454	1.373151	7.142702	112.782031
std	240.128713	20.512489	44.607175	1.680270	32.948926	131.620370
min	0.000000	1.000000	0.000000	0.010000	1.000000	0.000000
25%	69.000000	1.000000	1.000000	0.190000	1.000000	0.000000
50%	106.000000	3.000000	5.000000	0.720000	1.000000	45.000000
75%	175.000000	5.000000	24.000000	2.020000	2.000000	227.000000
max	10000.000000	1250.000000	629.000000	58.500000	327.000000	365.000000

	neighbourhood	No. of Hotels
0	Williamsburg	3921
1	Bedford-Stuyvesant	3715
2	Harlem	2658
3	Bushwick	2465
4	Upper West Side	1974
...	...	...
216	Fort Wadsworth	1
217	Richmondtown	1
218	New Dorp	1
219	Rossville	1
220	Willowbrook	1

	price	minimum_nights	number_of_reviews	reviews_per_month	calculated_host_listings_count	availability_365
price	1.000000	0.042771	-0.048014	-0.030608	0.057478	0.081817
minimum_nights	0.042771	1.000000	-0.080093	-0.121772	0.127917	0.144146
number_of_reviews	-0.048014	-0.080093	1.000000	0.549291	-0.072375	0.172002
reviews_per_month	-0.030608	-0.121772	0.549291	1.000000	-0.009414	0.185818
calculated_host_listings_count	0.057478	0.127917	-0.072375	-0.009414	1.000000	0.225680
availability_365	0.081817	0.144146	0.172002	0.185818	0.225680	1.000000

Categorical Data

Numerical Data

Measures of central tendency

Measures of Spread